Set Up

library(tidyverse)
library(here)
library(hrbrthemes)
library(janitor)
library(corrplot)
library(skimr)

# Select the "Rounding" sampling algorithm and fix the seed so that any
# random sampling later in the analysis is reproducible.
RNGkind(sample.kind = "Rounding")
set.seed(1)

# Make the hrbrthemes ipsum theme the default for every ggplot2 plot, then
# read data/creditcard.csv (path resolved with here()) into a tibble.
theme_set(theme_ipsum())
credit <- here("data", "creditcard.csv") %>%
  read_csv() %>%
  as_tibble()

EDA

# Preview the first rows of the data.
credit %>%
  head()

Variable Summary

# Check whether any value in the data is missing.
credit %>%
  anyNA()
[1] FALSE
# Per-column summary (missingness, quantiles, inline histograms). skim() is
# called through its namespace so this line does not depend on skimr having
# been attached earlier in the session.
skimr::skim(credit)
── Data Summary ────────────────────────
                           Values
Name                       credit
Number of rows             284807
Number of columns          31    
_______________________          
Column type frequency:           
  factor                   1     
  numeric                  30    
________________________         
Group variables            None  

── Variable type: factor ────────────────────────────────────────────────────────────────────────────────────────────────────
  skim_variable n_missing complete_rate ordered n_unique top_counts           
1 Class                 0             1 FALSE          2 Non: 284315, Fra: 492

── Variable type: numeric ───────────────────────────────────────────────────────────────────────────────────────────────────
   skim_variable n_missing complete_rate      mean        sd      p0        p25         p50         p75      p100 hist 
 1 Time                  0             1  9.48e+ 4 47488.       0    54202.     84692       139320.     172792    ▃▇▅▆▇
 2 V1                    0             1  1.17e-15     1.96   -56.4     -0.920      0.0181       1.32        2.45 ▁▁▁▁▇
 3 V2                    0             1  3.12e-16     1.65   -72.7     -0.599      0.0655       0.804      22.1  ▁▁▁▇▁
 4 V3                    0             1 -1.36e-15     1.52   -48.3     -0.890      0.180        1.03        9.38 ▁▁▁▁▇
 5 V4                    0             1  2.11e-15     1.42    -5.68    -0.849     -0.0198       0.743      16.9  ▂▇▁▁▁
 6 V5                    0             1  9.80e-16     1.38  -114.      -0.692     -0.0543       0.612      34.8  ▁▁▁▇▁
 7 V6                    0             1  1.51e-15     1.33   -26.2     -0.768     -0.274        0.399      73.3  ▁▇▁▁▁
 8 V7                    0             1 -5.42e-16     1.24   -43.6     -0.554      0.0401       0.570     121.   ▁▇▁▁▁
 9 V8                    0             1  1.03e-16     1.19   -73.2     -0.209      0.0224       0.327      20.0  ▁▁▁▇▁
10 V9                    0             1 -2.42e-15     1.10   -13.4     -0.643     -0.0514       0.597      15.6  ▁▁▇▁▁
11 V10                   0             1  2.23e-15     1.09   -24.6     -0.535     -0.0929       0.454      23.7  ▁▁▇▁▁
12 V11                   0             1  1.71e-15     1.02    -4.80    -0.762     -0.0328       0.740      12.0  ▁▇▁▁▁
13 V12                   0             1 -1.24e-15     0.999  -18.7     -0.406      0.140        0.618       7.85 ▁▁▁▇▁
14 V13                   0             1  8.35e-16     0.995   -5.79    -0.649     -0.0136       0.663       7.13 ▁▃▇▁▁
15 V14                   0             1  1.23e-15     0.959  -19.2     -0.426      0.0506       0.493      10.5  ▁▁▁▇▁
16 V15                   0             1  4.84e-15     0.915   -4.50    -0.583      0.0481       0.649       8.88 ▁▇▂▁▁
17 V16                   0             1  1.43e-15     0.876  -14.1     -0.468      0.0664       0.523      17.3  ▁▁▇▁▁
18 V17                   0             1 -3.78e-16     0.849  -25.2     -0.484     -0.0657       0.400       9.25 ▁▁▁▇▁
19 V18                   0             1  9.76e-16     0.838   -9.50    -0.499     -0.00364      0.501       5.04 ▁▁▂▇▁
20 V19                   0             1  1.04e-15     0.814   -7.21    -0.456      0.00373      0.459       5.59 ▁▁▇▂▁
21 V20                   0             1  6.41e-16     0.771  -54.5     -0.212     -0.0625       0.133      39.4  ▁▁▇▁▁
22 V21                   0             1  1.69e-16     0.735  -34.8     -0.228     -0.0295       0.186      27.2  ▁▁▇▁▁
23 V22                   0             1 -3.38e-16     0.726  -10.9     -0.542      0.00678      0.529      10.5  ▁▁▇▁▁
24 V23                   0             1  2.67e-16     0.624  -44.8     -0.162     -0.0112       0.148      22.5  ▁▁▁▇▁
25 V24                   0             1  4.47e-15     0.606   -2.84    -0.355      0.0410       0.440       4.58 ▁▇▆▁▁
26 V25                   0             1  5.11e-16     0.521  -10.3     -0.317      0.0166       0.351       7.52 ▁▁▇▂▁
27 V26                   0             1  1.68e-15     0.482   -2.60    -0.327     -0.0521       0.241       3.52 ▁▆▇▁▁
28 V27                   0             1 -3.67e-16     0.404  -22.6     -0.0708     0.00134      0.0910     31.6  ▁▁▇▁▁
29 V28                   0             1 -1.23e-16     0.330  -15.4     -0.0530     0.0112       0.0783     33.8  ▁▇▁▁▁
30 log_amount            0             1  3.15e+ 0     1.66     0        1.89       3.14         4.36       10.2  ▅▇▅▁▁
# Print summary() for every column, one column at a time.
# seq_len() is used instead of 1:ncol() so that a data frame with no columns
# produces zero iterations rather than iterating over c(1, 0).
for (col_idx in seq_len(ncol(credit))) {
  print(summary(credit[, col_idx]))
}
      Time       
 Min.   :     0  
 1st Qu.: 54202  
 Median : 84692  
 Mean   : 94814  
 3rd Qu.:139320  
 Max.   :172792  
       V1           
 Min.   :-56.40751  
 1st Qu.: -0.92037  
 Median :  0.01811  
 Mean   :  0.00000  
 3rd Qu.:  1.31564  
 Max.   :  2.45493  
       V2           
 Min.   :-72.71573  
 1st Qu.: -0.59855  
 Median :  0.06549  
 Mean   :  0.00000  
 3rd Qu.:  0.80372  
 Max.   : 22.05773  
       V3          
 Min.   :-48.3256  
 1st Qu.: -0.8904  
 Median :  0.1799  
 Mean   :  0.0000  
 3rd Qu.:  1.0272  
 Max.   :  9.3826  
       V4          
 Min.   :-5.68317  
 1st Qu.:-0.84864  
 Median :-0.01985  
 Mean   : 0.00000  
 3rd Qu.: 0.74334  
 Max.   :16.87534  
       V5            
 Min.   :-113.74331  
 1st Qu.:  -0.69160  
 Median :  -0.05434  
 Mean   :   0.00000  
 3rd Qu.:   0.61193  
 Max.   :  34.80167  
       V6          
 Min.   :-26.1605  
 1st Qu.: -0.7683  
 Median : -0.2742  
 Mean   :  0.0000  
 3rd Qu.:  0.3986  
 Max.   : 73.3016  
       V7          
 Min.   :-43.5572  
 1st Qu.: -0.5541  
 Median :  0.0401  
 Mean   :  0.0000  
 3rd Qu.:  0.5704  
 Max.   :120.5895  
       V8           
 Min.   :-73.21672  
 1st Qu.: -0.20863  
 Median :  0.02236  
 Mean   :  0.00000  
 3rd Qu.:  0.32735  
 Max.   : 20.00721  
       V9           
 Min.   :-13.43407  
 1st Qu.: -0.64310  
 Median : -0.05143  
 Mean   :  0.00000  
 3rd Qu.:  0.59714  
 Max.   : 15.59500  
      V10           
 Min.   :-24.58826  
 1st Qu.: -0.53543  
 Median : -0.09292  
 Mean   :  0.00000  
 3rd Qu.:  0.45392  
 Max.   : 23.74514  
      V11          
 Min.   :-4.79747  
 1st Qu.:-0.76249  
 Median :-0.03276  
 Mean   : 0.00000  
 3rd Qu.: 0.73959  
 Max.   :12.01891  
      V12          
 Min.   :-18.6837  
 1st Qu.: -0.4056  
 Median :  0.1400  
 Mean   :  0.0000  
 3rd Qu.:  0.6182  
 Max.   :  7.8484  
      V13          
 Min.   :-5.79188  
 1st Qu.:-0.64854  
 Median :-0.01357  
 Mean   : 0.00000  
 3rd Qu.: 0.66251  
 Max.   : 7.12688  
      V14          
 Min.   :-19.2143  
 1st Qu.: -0.4256  
 Median :  0.0506  
 Mean   :  0.0000  
 3rd Qu.:  0.4931  
 Max.   : 10.5268  
      V15          
 Min.   :-4.49894  
 1st Qu.:-0.58288  
 Median : 0.04807  
 Mean   : 0.00000  
 3rd Qu.: 0.64882  
 Max.   : 8.87774  
      V16           
 Min.   :-14.12985  
 1st Qu.: -0.46804  
 Median :  0.06641  
 Mean   :  0.00000  
 3rd Qu.:  0.52330  
 Max.   : 17.31511  
      V17           
 Min.   :-25.16280  
 1st Qu.: -0.48375  
 Median : -0.06568  
 Mean   :  0.00000  
 3rd Qu.:  0.39968  
 Max.   :  9.25353  
      V18           
 Min.   :-9.498746  
 1st Qu.:-0.498850  
 Median :-0.003636  
 Mean   : 0.000000  
 3rd Qu.: 0.500807  
 Max.   : 5.041069  
      V19           
 Min.   :-7.213527  
 1st Qu.:-0.456299  
 Median : 0.003735  
 Mean   : 0.000000  
 3rd Qu.: 0.458949  
 Max.   : 5.591971  
      V20           
 Min.   :-54.49772  
 1st Qu.: -0.21172  
 Median : -0.06248  
 Mean   :  0.00000  
 3rd Qu.:  0.13304  
 Max.   : 39.42090  
      V21           
 Min.   :-34.83038  
 1st Qu.: -0.22839  
 Median : -0.02945  
 Mean   :  0.00000  
 3rd Qu.:  0.18638  
 Max.   : 27.20284  
      V22            
 Min.   :-10.933144  
 1st Qu.: -0.542350  
 Median :  0.006782  
 Mean   :  0.000000  
 3rd Qu.:  0.528554  
 Max.   : 10.503090  
      V23           
 Min.   :-44.80774  
 1st Qu.: -0.16185  
 Median : -0.01119  
 Mean   :  0.00000  
 3rd Qu.:  0.14764  
 Max.   : 22.52841  
      V24          
 Min.   :-2.83663  
 1st Qu.:-0.35459  
 Median : 0.04098  
 Mean   : 0.00000  
 3rd Qu.: 0.43953  
 Max.   : 4.58455  
      V25           
 Min.   :-10.29540  
 1st Qu.: -0.31715  
 Median :  0.01659  
 Mean   :  0.00000  
 3rd Qu.:  0.35072  
 Max.   :  7.51959  
      V26          
 Min.   :-2.60455  
 1st Qu.:-0.32698  
 Median :-0.05214  
 Mean   : 0.00000  
 3rd Qu.: 0.24095  
 Max.   : 3.51735  
      V27            
 Min.   :-22.565679  
 1st Qu.: -0.070840  
 Median :  0.001342  
 Mean   :  0.000000  
 3rd Qu.:  0.091045  
 Max.   : 31.612198  
      V28           
 Min.   :-15.43008  
 1st Qu.: -0.05296  
 Median :  0.01124  
 Mean   :  0.00000  
 3rd Qu.:  0.07828  
 Max.   : 33.84781  
     Amount        
 Min.   :    0.00  
 1st Qu.:    5.60  
 Median :   22.00  
 Mean   :   88.35  
 3rd Qu.:   77.17  
 Max.   :25691.16  
     Class         
 Min.   :0.000000  
 1st Qu.:0.000000  
 Median :0.000000  
 Mean   :0.001728  
 3rd Qu.:0.000000  
 Max.   :1.000000  

Distributions

# Draw a density plot for every column except the response, Class.
# Class is excluded by name rather than by position (column 31) so the loop
# does not depend on column order, and columns are looked up with the .data
# pronoun because aes_string() is deprecated in ggplot2.
predictor_names <- setdiff(names(credit), "Class")

for (predictor_name in predictor_names) {
  density_plot <- ggplot(credit, aes(x = .data[[predictor_name]])) +
    geom_density(fill = "cornsilk") +
    # Label the axis with the column name, as aes_string() did.
    labs(x = predictor_name)

  # Plots built inside a loop are not auto-printed, so print explicitly.
  print(density_plot)
}

Relationship with Response

# Histogram of transaction Time, filled by Class, with one panel per class;
# each panel gets its own axis scales.
credit %>%
  ggplot(aes(x = Time, fill = Class)) +
  geom_histogram() +
  facet_wrap(~Class, scales = "free")

# Histogram of the log-transformed transaction amount, one panel per class
# with free axis scales.
# log1p(Amount) = log(Amount + 1) is used because Amount contains zeros:
# log(0) is -Inf, and ggplot2 removes non-finite values from the plot, so
# zero-amount transactions would otherwise be left out.
ggplot(data = credit, aes(x = log1p(Amount), fill = Class)) +
  geom_histogram() +
  facet_wrap(~Class, scales = "free") +
  labs(x = "log(Amount + 1)")

# Scatter plot of log-transformed amount against Time, one panel per class
# with free axis scales.
# - alpha is set as a fixed value on the geom. Inside aes() it would be
#   treated as a mapped variable, adding a meaningless legend and not
#   actually drawing the points at 0.2 opacity.
# - log1p(Amount) = log(Amount + 1) keeps zero-amount transactions on the
#   plot; log(0) is -Inf and ggplot2 removes non-finite values.
ggplot(data = credit, aes(x = Time, y = log1p(Amount))) +
  geom_point(alpha = 0.2) +
  facet_wrap(~Class, scales = "free") +
  labs(y = "log(Amount + 1)")

The Amount variable is clearly skewed, so it is worth applying a log transformation. Because some amounts are exactly 0, we add 1 before taking the log so that the transformation does not produce -Inf values.

# Add log_amount = log(Amount + 1) as a new column of credit.
credit <- mutate(credit, log_amount = log(Amount + 1))

# Summarise the transformed values.
summary(credit[["log_amount"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   1.887   3.135   3.152   4.359  10.154 
# Density of the log-transformed amount.
credit %>%
  ggplot(aes(x = log_amount)) +
  geom_density(fill = "cornsilk")

Predictor Correlations

# Correlation plot (upper triangle, square glyphs) of every column except the
# response. Class is dropped by name rather than by position (column 31) so
# the result does not depend on column order.
credit %>%
  select(-Class) %>%
  cor() %>%
  corrplot(method = "square", type = "upper")

Transforming Amount has helped to reduce collinearity among the predictors. The only notable correlations are between V3 and Time, and between V2 and log_amount. This is expected: PCA produces orthogonal linear combinations, so there should be little correlation among the resulting components.

LS0tCnRpdGxlOiAiU1RBVCA1MDggRmluYWwgUHJvamVjdCBFREEiCmF1dGhvcjogIkNhbGx1bSBBcm5vbGQiCm91dHB1dDoKICAgIGh0bWxfbm90ZWJvb2s6CiAgICAgICAgY29kZV9mb2xkaW5nOiBoaWRlCiAgICAgICAgdG9jOiB5ZXMKICAgICAgICB0b2NfZmxvYXQ6IHllcwotLS0KCiMgU2V0IFVwCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLCB3YXJuaW5nID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSwgZmlnLndpZHRoID0gMTQpCmBgYAoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGhlcmUpCmxpYnJhcnkoaHJicnRoZW1lcykKbGlicmFyeShqYW5pdG9yKQpsaWJyYXJ5KGNvcnJwbG90KQpsaWJyYXJ5KHNraW1yKQoKUk5Ha2luZChzYW1wbGUua2luZCA9ICJSb3VuZGluZyIpCnNldC5zZWVkKDEpCgp0aGVtZV9zZXQodGhlbWVfaXBzdW0oKSkKYGBgCgpgYGB7cn0KY3JlZGl0IDwtIGFzX3RpYmJsZShyZWFkX2NzdihoZXJlKCJkYXRhIiwgImNyZWRpdGNhcmQuY3N2IikpKQpgYGAKCiMgRURBCgpgYGB7cn0KaGVhZChjcmVkaXQpCmBgYAoKIyMgVmFyaWFibGUgU3VtbWFyeQoKYGBge3J9CmFueU5BKGNyZWRpdCkKYGBgCgpgYGB7cn0Kc2tpbShjcmVkaXQpCmBgYAoKCmBgYHtyfQpmb3IgKGkgaW4gMTpuY29sKGNyZWRpdCkpewogIHByaW50KHN1bW1hcnkoY3JlZGl0WywgaV0pKQp9CmBgYAoKIyMjIERpc3RyaWJ1dGlvbnMKCmBgYHtyfQpmb3IgKGkgaW4gbmFtZXMoY3JlZGl0WywgLTMxXSkpIHsKICBwIDwtIGdncGxvdChjcmVkaXQsIGFlc19zdHJpbmcoeCA9IGkpKSArCiAgICBnZW9tX2RlbnNpdHkoZmlsbCA9ICJjb3Juc2lsayIpCgogIHByaW50KHApCn0KYGBgCgojIyBSZWxhdGlvbnNoaXAgd2l0aCBSZXNwb25zZQoKYGBge3J9CmdncGxvdChkYXRhID0gY3JlZGl0LCBhZXMoeCA9IFRpbWUsIGZpbGwgPSBDbGFzcykpICsKICBnZW9tX2hpc3RvZ3JhbSgpICsKICBmYWNldF93cmFwKH5DbGFzcywgc2NhbGVzID0gImZyZWUiKQpgYGAKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGNyZWRpdCwgYWVzKHggPSBsb2coQW1vdW50KSwgZmlsbCA9IENsYXNzKSkgKwogIGdlb21faGlzdG9ncmFtKCkgKwogIGZhY2V0X3dyYXAofkNsYXNzLCBzY2FsZXMgPSAiZnJlZSIpCmBgYAoKYGBge3J9CmdncGxvdChkYXRhID0gY3JlZGl0LCBhZXMoeCA9IFRpbWUsIHkgPSBsb2coQW1vdW50KSwgYWxwaGEgPSAwLjIpKSArCiAgZ2VvbV9wb2ludCgpICsKICBmYWNldF93cmFwKH5DbGFzcywgc2NhbGVzID0gImZyZWUiKQpgYGAKVGhlcmUgaXMgYSBjbGVhciBza2V3IHRvIHRoZSBgQW1vdW50YCB2YXJpYWJsZSwgc28gaXQgaXMgd29ydGggYXBwbHlpbmcgYSB0cmFuc2Zvcm1hdGlvbiB0byB0aGUgZGF0YS4gCkFzIHRoZXJlIGFyZSB2YWx1ZXMgb2YgYDBgLCB3ZSBuZWVkIHRvIGFkZCBgMWAgdG8gZW5zdXJlIHRoYXQgd2UgZG9uJ3QgZ2V0IGBJbmZgIHZhbHVlcyBw
cm9kdWNlZCBhZnRlciBsb2cgdHJhbnNmb3JtYXRpb24uCgpgYGB7cn0KY3JlZGl0IDwtIGNyZWRpdCAlPiUKICBtdXRhdGUobG9nX2Ftb3VudCA9IGxvZyhBbW91bnQgKyAxKSkKCnN1bW1hcnkoY3JlZGl0JGxvZ19hbW91bnQpCmBgYAoKYGBge3J9CmdncGxvdChjcmVkaXQsIGFlcyh4ID0gbG9nX2Ftb3VudCkpICsKICAgIGdlb21fZGVuc2l0eShmaWxsID0gImNvcm5zaWxrIikKYGBgCgojIyBQcmVkaWN0b3IgQ29ycmVsYXRpb25zCgpgYGB7cn0KY29ycnBsb3QoY29yKGNyZWRpdFssIC0zMV0pLCBtZXRob2QgPSAic3F1YXJlIiwgdHlwZSA9ICJ1cHBlciIpCmBgYAoKVHJhbnNmb3JtYXRpb24gb2YgYEFtb3VudGAgaGFzIGhlbHBlZCB0byByZWR1Y2UgY29sbGluZWFyaXRpZXMgb2YgdGhlIHByZWRpY3RvcnMuClRoZSBvbmx5IGNvcnJlbGF0aW9ucyBhcmUgYmV0d2VlbiBgVjNgIGFuZCBgVGltZWAsIGFuZCBgVjJgIGFuZCBgbG9nX2Ftb3VudGAuClRoaXMgaXMgYXMgZXhwZWN0ZWQgYXMgUENBIHByb2R1Y2VzIG9ydGhvZ29uYWwgbGluZWFyIGNvbWJpbmF0aW9ucywKdGhlcmVmb3JlIHRoZXJlIHNob3VsZG4ndCBiZSBtdWNoIGNvcnJlbGF0aW9uIGJldHdlZW4gdGhlbS4KCg==